In [3]:
%matplotlib inline
In [5]:
# Monkey patch SSL as proxy is not working
import ssl
if hasattr(ssl, '_create_unverified_context'):
ssl._create_default_https_context = ssl._create_unverified_context
In [6]:
from matplotlib import pyplot as plt
import numpy as np
import pymc3 as pm
from pymc3.distributions.timeseries import GaussianRandomWalk
import seaborn as sns
from statsmodels import datasets
from theano import tensor as T
In [7]:
df = datasets.get_rdataset('mastectomy', 'HSAUR', cache=True).data
df.event = df.event.astype(np.int64)
df.metastized = (df.metastized == 'yes').astype(np.int64)
n_patients = df.shape[0]
patients = np.arange(n_patients)
In [8]:
df.head()
Out[8]:
In [9]:
n_patients
Out[9]:
In [10]:
df.event.mean()
Out[10]:
In [11]:
fig, ax = plt.subplots(figsize=(8, 6))
blue, _, red = sns.color_palette()[:3]
ax.hlines(patients[df.event.values == 0], 0, df[df.event.values == 0].time,
color=blue, label='Censored');
ax.hlines(patients[df.event.values == 1], 0, df[df.event.values == 1].time,
color=red, label='Uncensored');
ax.scatter(df[df.metastized.values == 1].time, patients[df.metastized.values == 1],
color='k', zorder=10, label='Metastized');
ax.set_xlim(left=0);
ax.set_xlabel('Months since mastectomy');
ax.set_ylim(-0.25, n_patients + 0.25);
ax.legend(loc='center right');
In [12]:
# Bayesian proportional hazards model
interval_length = 3
interval_bounds = np.arange(0, df.time.max() + interval_length + 1, interval_length)
n_intervals = interval_bounds.size - 1
intervals = np.arange(n_intervals)
In [13]:
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(df[df.event == 1].time.values, bins=interval_bounds,
color=red, alpha=0.5, lw=0,
label='Uncensored');
ax.hist(df[df.event == 0].time.values, bins=interval_bounds,
color=blue, alpha=0.5, lw=0,
label='Censored');
ax.set_xlim(0, interval_bounds[-1]);
ax.set_xlabel('Months since mastectomy');
ax.set_yticks([0, 1, 2, 3]);
ax.set_ylabel('Number of observations');
ax.legend();
In [36]:
last_period = np.array(np.floor((df.time - 0.01) / interval_length), dtype=int)
death = np.zeros((n_patients, n_intervals))
death[patients, last_period] = df.event
In [37]:
exposure = np.greater_equal.outer(df.time, interval_bounds[:-1]) * interval_length
exposure[patients, last_period] = df.time - interval_bounds[last_period]
In [38]:
SEED = 1087
In [39]:
with pm.Model() as model:
lambda0 = pm.Gamma('lambda0', 0.01, 0.01, shape=n_intervals)
sigma = pm.Uniform('sigma', 0., 10.)
tau = pm.Deterministic('tau', sigma**-2)
mu_beta = pm.Normal('mu_beta', 0., 10**-2)
beta = pm.Normal('beta', mu_beta, tau)
lambda_ = pm.Deterministic('lambda_', T.outer(T.exp(beta * df.metastized), lambda0))
mu = pm.Deterministic('mu', exposure * lambda_)
obs = pm.Poisson('obs', mu, observed=death)
In [40]:
n_samples = 40000
burn = 20000
thin = 20
In [41]:
with model:
step = pm.Metropolis()
trace_ = pm.sample(n_samples, step, random_seed=SEED)
In [42]:
trace = trace_[burn::thin]
In [43]:
np.exp(trace['beta'].mean())
Out[43]:
In [47]:
pm.traceplot(trace, varnames=['beta']);
In [48]:
pm.autocorrplot(trace, varnames=['beta']);
In [49]:
base_hazard = trace['lambda0']
met_hazard = trace['lambda0'] * np.exp(np.atleast_2d(trace['beta']).T)
In [50]:
def cum_hazard(hazard):
return (interval_length * hazard).cumsum(axis=-1)
def survival(hazard):
return np.exp(-cum_hazard(hazard))
In [51]:
def plot_with_hpd(x, hazard, f, ax, color=None, label=None, alpha=0.05):
mean = f(hazard.mean(axis=0))
percentiles = 100 * np.array([alpha / 2., 1. - alpha / 2.])
hpd = np.percentile(f(hazard), percentiles, axis=0)
ax.fill_between(x, hpd[0], hpd[1], color=color, alpha=0.25)
ax.step(x, mean, color=color, label=label);
In [52]:
fig, (hazard_ax, surv_ax) = plt.subplots(ncols=2, sharex=True, sharey=False, figsize=(16, 6))
plot_with_hpd(interval_bounds[:-1], base_hazard, cum_hazard,
hazard_ax, color=blue, label='Had not metastized')
plot_with_hpd(interval_bounds[:-1], met_hazard, cum_hazard,
hazard_ax, color=red, label='Metastized')
hazard_ax.set_xlim(0, df.time.max());
hazard_ax.set_xlabel('Months since mastectomy');
hazard_ax.set_ylabel(r'Cumulative hazard $\Lambda(t)$');
hazard_ax.legend(loc=2);
plot_with_hpd(interval_bounds[:-1], base_hazard, survival,
surv_ax, color=blue)
plot_with_hpd(interval_bounds[:-1], met_hazard, survival,
surv_ax, color=red)
surv_ax.set_xlim(0, df.time.max());
surv_ax.set_xlabel('Months since mastectomy');
surv_ax.set_ylabel('Survival function $S(t)$');
fig.suptitle('Bayesian survival model');
In [53]:
with pm.Model() as time_varying_model:
lambda0 = pm.Gamma('lambda0', 0.01, 0.01, shape=n_intervals)
beta = GaussianRandomWalk('beta', tau=1., shape=n_intervals)
lambda_ = pm.Deterministic('h', lambda0 * T.exp(T.outer(T.constant(df.metastized), beta)))
mu = pm.Deterministic('mu', exposure * lambda_)
obs = pm.Poisson('obs', mu, observed=death)
In [54]:
with time_varying_model:
step = pm.Metropolis()
time_varying_trace_ = pm.sample(n_samples, step, random_seed=SEED)
In [55]:
time_varying_trace = time_varying_trace_[burn::thin]
In [56]:
fig, ax = plt.subplots(figsize=(8, 6))
beta_hpd = np.percentile(time_varying_trace['beta'], [2.5, 97.5], axis=0)
beta_low = beta_hpd[0]
beta_high = beta_hpd[1]
ax.fill_between(interval_bounds[:-1], beta_low, beta_high,
color=blue, alpha=0.25);
beta_hat = time_varying_trace['beta'].mean(axis=0)
ax.step(interval_bounds[:-1], beta_hat, color=blue);
ax.scatter(interval_bounds[last_period[(df.event.values == 1) & (df.metastized == 1)]],
beta_hat[last_period[(df.event.values == 1) & (df.metastized == 1)]],
c=red, zorder=10, label='Died, cancer metastized');
ax.scatter(interval_bounds[last_period[(df.event.values == 0) & (df.metastized == 1)]],
beta_hat[last_period[(df.event.values == 0) & (df.metastized == 1)]],
c=blue, zorder=10, label='Censored, cancer metastized');
ax.set_xlim(0, df.time.max());
ax.set_xlabel('Months since mastectomy');
ax.set_ylabel(r'$\beta_j$');
ax.legend();
In [57]:
tv_base_hazard = time_varying_trace['lambda0']
tv_met_hazard = time_varying_trace['lambda0'] * np.exp(np.atleast_2d(time_varying_trace['beta']))
In [58]:
fig, ax = plt.subplots(figsize=(8, 6))
ax.step(interval_bounds[:-1], cum_hazard(base_hazard.mean(axis=0)),
color=blue, label='Had not metastized');
ax.step(interval_bounds[:-1], cum_hazard(met_hazard.mean(axis=0)),
color=red, label='Metastized');
ax.step(interval_bounds[:-1], cum_hazard(tv_base_hazard.mean(axis=0)),
color=blue, linestyle='--', label='Had not metastized (time varying effect)');
ax.step(interval_bounds[:-1], cum_hazard(tv_met_hazard.mean(axis=0)),
color=red, linestyle='--', label='Metastized (time varying effect)');
ax.set_xlim(0, df.time.max() - 4);
ax.set_xlabel('Months since mastectomy');
ax.set_ylim(0, 2);
ax.set_ylabel(r'Cumulative hazard $\Lambda(t)$');
ax.legend(loc=2);
In [59]:
fig, (hazard_ax, surv_ax) = plt.subplots(ncols=2, sharex=True, sharey=False, figsize=(16, 6))
plot_with_hpd(interval_bounds[:-1], tv_base_hazard, cum_hazard,
hazard_ax, color=blue, label='Had not metastized')
plot_with_hpd(interval_bounds[:-1], tv_met_hazard, cum_hazard,
hazard_ax, color=red, label='Metastized')
hazard_ax.set_xlim(0, df.time.max());
hazard_ax.set_xlabel('Months since mastectomy');
hazard_ax.set_ylim(0, 2);
hazard_ax.set_ylabel(r'Cumulative hazard $\Lambda(t)$');
hazard_ax.legend(loc=2);
plot_with_hpd(interval_bounds[:-1], tv_base_hazard, survival,
surv_ax, color=blue)
plot_with_hpd(interval_bounds[:-1], tv_met_hazard, survival,
surv_ax, color=red)
surv_ax.set_xlim(0, df.time.max());
surv_ax.set_xlabel('Months since mastectomy');
surv_ax.set_ylabel('Survival function $S(t)$');
fig.suptitle('Bayesian survival model with time varying effects');
In [ ]: